Required Packages

library(readr)
library(plotly)
library(tidyverse)
library(lubridate)
library(MASS)
library(mice)
library(ROSE)
library(caret)
library(randomForest)
library(e1071)
library(partykit)
library(nnet)
library(revgeo)
library(opencage)
library(rgeos)
library(ggforce)
library(ggplot2)
library(ggmap)
library(DT)
library(viridis)
library(rpart.plot)
library(patchwork)

Reading data and cleaning

az_me <- read_csv("az_mesa_2019_12_17.csv")
population<-read_csv("Population.csv")
zipcode<-read_csv("zipcode.csv")
az_me <- az_me[,c("date","time","subject_age","subject_race","subject_sex","violation","arrest_made","lng","lat","type")]
az_me<- az_me %>% filter( type=="vehicular" & subject_race !="unknown" )

az_me<-na.omit(az_me)

Preliminary Data Analysis

Trend for stopping accross mesa

graph<-az_me %>%
  group_by(Year=year(date),subject_race)%>%count() 
graph <- inner_join(graph,population,by=c('Year','subject_race')) 

graph<-graph %>% mutate(percentage=n*100/Total)

g1<-ggplot(graph, aes(
  x = Year,
  y = n,
  col= subject_race
  )) +
  geom_line() +
  geom_point() +
  xlab("Year") +
  ylab("Total Number of cases") +
  theme(axis.text.x = element_text(face = "bold", size = 10, angle = 45))
g2<-ggplot(graph, aes(
  x = Year,
  y = percentage,
  col= subject_race
  )) +
  geom_line() +
  geom_point() +
  xlab("Year") +
  ylab("Percentage of population by race") +
  theme(axis.text.x = element_text(face = "bold", size = 10, angle = 45))

combined <- g1 + g2 & theme(legend.position = "right") 
combined + plot_layout(guides = "collect")

Trend for arresting accross mesa

graph<-az_me %>% filter(arrest_made==TRUE) %>%
  group_by(Year=year(date),subject_race)%>%count() 
graph <- inner_join(graph,population,by=c('Year','subject_race')) 

graph<-graph %>% mutate(percentage=n*100/Total)

g1<-ggplot(graph, aes(
  x = Year,
  y = n,
  col= subject_race
  )) +
  geom_line() +
  geom_point() +
  xlab("Year") +
  ylab("Total Number of cases") +
  theme(axis.text.x = element_text(face = "bold", size = 10, angle = 45))
g2<-ggplot(graph, aes(
  x = Year,
  y = percentage,
  col= subject_race
  )) +
  geom_line() +
  geom_point() +
  xlab("Year") +
  ylab("Percentage of population by race") +
  theme(axis.text.x = element_text(face = "bold", size = 10, angle = 45))

combined <- g1 + g2 & theme(legend.position = "right") 
combined + plot_layout(guides = "collect")

Total number of cases: Hour-wise

az_me_age <- az_me %>%
mutate(age_level=cut(az_me$subject_age, breaks =c(15,30,50,81), labels=c('15-30','30-50','>50')))
az_me_age$subject_race=ifelse(az_me_age$subject_race=='unknown',NA,az_me_age$subject_race)
az_me_age<-na.omit(az_me_age)

graph_time<-az_me_age %>%
  group_by(hour=as.factor(hour(time)),age_level)%>%count()

g2<-ggplot(na.omit(graph_time), aes(
  x = hour,
  y = n,
  fill= age_level
  )) +
  geom_col() +
  xlab("Hour") +
  ylab("Count") +
  theme(axis.text.x = element_text(face = "bold", size = 10, angle = 45))+
  scale_fill_brewer(palette = "Dark2") 
ggplotly(g2)

Total number of cases: Day - Wise

graph_day<-az_me_age %>%
  group_by(day=as.factor(weekdays(as.Date(date))),age_level)%>%count()
# graph_day$subject_race <- fct_relevel(as.factor(graph_day$subject_race),'white',after=0L)

g3<-ggplot(graph_day, aes(
  x = reorder(day,n),
  y = n,
  fill= age_level
  )) +
  geom_col(width = 0.5) +
  xlab("Days") +
  ylab("Count")+
  theme(axis.text.x = element_text(face = "bold", size = 10, angle = 45))+
  scale_fill_brewer(palette = "Dark2") 
ggplotly(g3)

Total number of vehicles stopped vs Reason for being stopped

violation<-az_me%>%filter(type=="vehicular") %>% group_by(violation)%>%summarise(count=n())
violation <- na.omit(violation)
violation<-violation[order(violation$count,decreasing = TRUE),]

df3<-az_me_age %>%
  filter(type=="vehicular") %>%
      group_by(violation,age_level) %>%
      summarise(count=n())%>%
      arrange(desc(count))
    
    df3<-na.omit(df3)
    df3$violation<-as.factor(df3$violation)
    df4<-df3[1:15,]
    g4<-ggplot(data=df4, aes(x=reorder(violation,age_level),y=count,fill=age_level)) +
      geom_bar(stat = "identity",position = "dodge") +
      coord_flip()+ 
      xlab("Reason for being stopped") +
      ylab("Total number of vehicles stopped")
      
 ggplotly(g4)

Model fitting using caret package

Selecting only arrest data

az_me_age_model<-az_me %>% filter( arrest_made == TRUE)

Imputing zipcode which is generated from latitude and longitude values which will be a predictor for my model

# Code for generating zipcode is commented because it gets stuck while knitting the file. so I have written the file to directory and read in to this rmd separately
# x<-revgeo(az_me_age_model$lng, az_me_age_model$lat,output = "frame")$zip
# y<-str_replace(x, ".*\\b(\\d{5})\\b.*", "\\1")
# write.csv(y,"~/Project/zipcode.csv", row.names = FALSE)
az_me_age_model$zipcode<-zipcode$x
az_me_age_model[az_me_age_model$zipcode=="Postcode Not Found",]$zipcode=NA
az_me_age_model<-na.omit(az_me_age_model)

Dividing violations in 5 broad categories which act as response for my model

az_me_age_model$viotype <-
  ifelse(
    grepl("DRUGS|LIQUOR|PHONE", az_me_age_model$violation),
    "distracted_driving",
       ifelse(
        grepl("DEVICE|LIGHT|STOP|LIGHTS|LAMPS", az_me_age_model$violation),
        "running_red_light_or_without_headlight",
        ifelse(
          grepl(
            "TURN|SPEED|MPH|FOLLOWING|AGGRESSIVE |POLICE|DRIVEWAY|JAYWALKING|PEDESTRIAN|RECKLESS|SIDEWALK|PSS|LEFT|RIGHT|SIGNAL|LANE|UNSAFE|EMERGENCY",
            az_me_age_model$violation
          ),
          "reckless_driving",
          ifelse(
            grepl(
              "ACCIDENT|QUICK|UNATT|INFO|REPORT|DAMAGE|RELEASE",
              az_me_age_model$violation
            ),
            "leaving_scene_of_an_accident_informing",
            "without_proper_documents_or_vehicle_defects"
          )
        )
      )
    )
  
  
az_me_age_model<-az_me_age_model %>%
  mutate(subject_race=as.factor(subject_race),
         subject_sex=as.factor(subject_sex),
         viotype=as.factor(viotype),
         zipcode=as.factor(zipcode))

Types of Models fitted

Multinomial Model

model<- viotype~subject_age+subject_race+subject_sex+zipcode

#Multinomial model with up sampling
ctrl <- trainControl(method = "repeatedcv", 
                     number = 10,
                     sampling = "up")

set.seed(42)
model_rf_over <- caret::train(model,
                              data = az_me_age_model,
                              method = "multinom",
                              preProcess = c("scale", "center"),
                              trControl = ctrl,
                              tuneLength = 5)
x<-predict(model_rf_over, newdata = az_me_age_model, type = "raw")
result<- confusionMatrix(x,az_me_age_model$viotype)
result
## Confusion Matrix and Statistics
## 
##                                              Reference
## Prediction                                    distracted_driving
##   distracted_driving                                          22
##   leaving_scene_of_an_accident_informing                       7
##   reckless_driving                                             9
##   running_red_light_or_without_headlight                       8
##   without_proper_documents_or_vehicle_defects                 12
##                                              Reference
## Prediction                                    leaving_scene_of_an_accident_informing
##   distracted_driving                                                               1
##   leaving_scene_of_an_accident_informing                                          11
##   reckless_driving                                                                 1
##   running_red_light_or_without_headlight                                           2
##   without_proper_documents_or_vehicle_defects                                      1
##                                              Reference
## Prediction                                    reckless_driving
##   distracted_driving                                        28
##   leaving_scene_of_an_accident_informing                    31
##   reckless_driving                                          29
##   running_red_light_or_without_headlight                    36
##   without_proper_documents_or_vehicle_defects               24
##                                              Reference
## Prediction                                    running_red_light_or_without_headlight
##   distracted_driving                                                              20
##   leaving_scene_of_an_accident_informing                                          19
##   reckless_driving                                                                16
##   running_red_light_or_without_headlight                                          50
##   without_proper_documents_or_vehicle_defects                                     19
##                                              Reference
## Prediction                                    without_proper_documents_or_vehicle_defects
##   distracted_driving                                                                  112
##   leaving_scene_of_an_accident_informing                                               52
##   reckless_driving                                                                     87
##   running_red_light_or_without_headlight                                              104
##   without_proper_documents_or_vehicle_defects                                         169
## 
## Overall Statistics
##                                          
##                Accuracy : 0.323          
##                  95% CI : (0.292, 0.3552)
##     No Information Rate : 0.6023         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0.1175         
##                                          
##  Mcnemar's Test P-Value : <2e-16         
## 
## Statistics by Class:
## 
##                      Class: distracted_driving
## Sensitivity                            0.37931
## Specificity                            0.80172
## Pos Pred Value                         0.12022
## Neg Pred Value                         0.94760
## Prevalence                             0.06667
## Detection Rate                         0.02529
## Detection Prevalence                   0.21034
## Balanced Accuracy                      0.59052
##                      Class: leaving_scene_of_an_accident_informing
## Sensitivity                                                0.68750
## Specificity                                                0.87237
## Pos Pred Value                                             0.09167
## Neg Pred Value                                             0.99333
## Prevalence                                                 0.01839
## Detection Rate                                             0.01264
## Detection Prevalence                                       0.13793
## Balanced Accuracy                                          0.77993
##                      Class: reckless_driving
## Sensitivity                          0.19595
## Specificity                          0.84349
## Pos Pred Value                       0.20423
## Neg Pred Value                       0.83654
## Prevalence                           0.17011
## Detection Rate                       0.03333
## Detection Prevalence                 0.16322
## Balanced Accuracy                    0.51972
##                      Class: running_red_light_or_without_headlight
## Sensitivity                                                0.40323
## Specificity                                                0.79893
## Pos Pred Value                                             0.25000
## Neg Pred Value                                             0.88955
## Prevalence                                                 0.14253
## Detection Rate                                             0.05747
## Detection Prevalence                                       0.22989
## Balanced Accuracy                                          0.60108
##                      Class: without_proper_documents_or_vehicle_defects
## Sensitivity                                                      0.3225
## Specificity                                                      0.8382
## Pos Pred Value                                                   0.7511
## Neg Pred Value                                                   0.4496
## Prevalence                                                       0.6023
## Detection Rate                                                   0.1943
## Detection Prevalence                                             0.2586
## Balanced Accuracy                                                0.5803
#For combinig the Recall Precision to calculate F1 score
get.macro.f1 <- function(cm) {
    c <- cm$byClass # a single matrix is sufficient
    re <- sum(c[, "Recall"]) / nrow(c)
    pr <- sum(c[, "Precision"]) / nrow(c)
    f1 <- 2 * ((re * pr) / (re + pr))
    ac<-cm$overall["Accuracy"]
    f2<-c(ac,re,pr,f1)
    return(f2)
}
macro.mul <- get.macro.f1(result)

Decision tree

ctrl <- trainControl(method = "repeatedcv", 
                     number = 10,
                     repeats = 3,
                     sampling = "up")

set.seed(42)
model_rf_over <- caret::train(model,
                              data = az_me_age_model,
                              method = "rpart",
                              parms = list(split = "information"),
                              tuneLength = 10,
                              trControl = ctrl)

x<-predict(model_rf_over, newdata = az_me_age_model, type = "raw")
result<-confusionMatrix(x,az_me_age_model$viotype)
result
## Confusion Matrix and Statistics
## 
##                                              Reference
## Prediction                                    distracted_driving
##   distracted_driving                                          52
##   leaving_scene_of_an_accident_informing                       1
##   reckless_driving                                             0
##   running_red_light_or_without_headlight                       3
##   without_proper_documents_or_vehicle_defects                  2
##                                              Reference
## Prediction                                    leaving_scene_of_an_accident_informing
##   distracted_driving                                                               0
##   leaving_scene_of_an_accident_informing                                          16
##   reckless_driving                                                                 0
##   running_red_light_or_without_headlight                                           0
##   without_proper_documents_or_vehicle_defects                                      0
##                                              Reference
## Prediction                                    reckless_driving
##   distracted_driving                                        17
##   leaving_scene_of_an_accident_informing                     5
##   reckless_driving                                          75
##   running_red_light_or_without_headlight                    28
##   without_proper_documents_or_vehicle_defects               23
##                                              Reference
## Prediction                                    running_red_light_or_without_headlight
##   distracted_driving                                                              14
##   leaving_scene_of_an_accident_informing                                           4
##   reckless_driving                                                                 8
##   running_red_light_or_without_headlight                                          87
##   without_proper_documents_or_vehicle_defects                                     11
##                                              Reference
## Prediction                                    without_proper_documents_or_vehicle_defects
##   distracted_driving                                                                   95
##   leaving_scene_of_an_accident_informing                                               18
##   reckless_driving                                                                     77
##   running_red_light_or_without_headlight                                               92
##   without_proper_documents_or_vehicle_defects                                         242
## 
## Overall Statistics
##                                          
##                Accuracy : 0.5425         
##                  95% CI : (0.5088, 0.576)
##     No Information Rate : 0.6023         
##     P-Value [Acc > NIR] : 0.9998         
##                                          
##                   Kappa : 0.371          
##                                          
##  Mcnemar's Test P-Value : <2e-16         
## 
## Statistics by Class:
## 
##                      Class: distracted_driving
## Sensitivity                            0.89655
## Specificity                            0.84483
## Pos Pred Value                         0.29213
## Neg Pred Value                         0.99133
## Prevalence                             0.06667
## Detection Rate                         0.05977
## Detection Prevalence                   0.20460
## Balanced Accuracy                      0.87069
##                      Class: leaving_scene_of_an_accident_informing
## Sensitivity                                                1.00000
## Specificity                                                0.96721
## Pos Pred Value                                             0.36364
## Neg Pred Value                                             1.00000
## Prevalence                                                 0.01839
## Detection Rate                                             0.01839
## Detection Prevalence                                       0.05057
## Balanced Accuracy                                          0.98361
##                      Class: reckless_driving
## Sensitivity                          0.50676
## Specificity                          0.88227
## Pos Pred Value                       0.46875
## Neg Pred Value                       0.89718
## Prevalence                           0.17011
## Detection Rate                       0.08621
## Detection Prevalence                 0.18391
## Balanced Accuracy                    0.69451
##                      Class: running_red_light_or_without_headlight
## Sensitivity                                                 0.7016
## Specificity                                                 0.8351
## Pos Pred Value                                              0.4143
## Neg Pred Value                                              0.9439
## Prevalence                                                  0.1425
## Detection Rate                                              0.1000
## Detection Prevalence                                        0.2414
## Balanced Accuracy                                           0.7684
##                      Class: without_proper_documents_or_vehicle_defects
## Sensitivity                                                      0.4618
## Specificity                                                      0.8960
## Pos Pred Value                                                   0.8705
## Neg Pred Value                                                   0.5236
## Prevalence                                                       0.6023
## Detection Rate                                                   0.2782
## Detection Prevalence                                             0.3195
## Balanced Accuracy                                                0.6789
get.macro.f1 <- function(cm) {
    c <- cm$byClass # a single matrix is sufficient
    re <- sum(c[, "Recall"]) / nrow(c)
    pr <- sum(c[, "Precision"]) / nrow(c)
    f1 <- 2 * ((re * pr) / (re + pr))
    ac<-cm$overall["Accuracy"]
    f2<-c(ac,re,pr,f1)
    return(f2)
}
macro.dt <- get.macro.f1(result)

Random Forest

ctrl <- trainControl(method = "oob", 
                     number = 10,
                     sampling = "up")

set.seed(42)
model_rf_over_rf <- caret::train(model,
                              data = az_me_age_model,
                              method = "rf",
                              trControl = ctrl)


x<-predict(model_rf_over_rf, newdata = az_me_age_model, type = "raw")
result<-confusionMatrix(x,az_me_age_model$viotype)
result
## Confusion Matrix and Statistics
## 
##                                              Reference
## Prediction                                    distracted_driving
##   distracted_driving                                          58
##   leaving_scene_of_an_accident_informing                       0
##   reckless_driving                                             0
##   running_red_light_or_without_headlight                       0
##   without_proper_documents_or_vehicle_defects                  0
##                                              Reference
## Prediction                                    leaving_scene_of_an_accident_informing
##   distracted_driving                                                               0
##   leaving_scene_of_an_accident_informing                                          16
##   reckless_driving                                                                 0
##   running_red_light_or_without_headlight                                           0
##   without_proper_documents_or_vehicle_defects                                      0
##                                              Reference
## Prediction                                    reckless_driving
##   distracted_driving                                        11
##   leaving_scene_of_an_accident_informing                     1
##   reckless_driving                                         126
##   running_red_light_or_without_headlight                     8
##   without_proper_documents_or_vehicle_defects                2
##                                              Reference
## Prediction                                    running_red_light_or_without_headlight
##   distracted_driving                                                               6
##   leaving_scene_of_an_accident_informing                                           3
##   reckless_driving                                                                 5
##   running_red_light_or_without_headlight                                         109
##   without_proper_documents_or_vehicle_defects                                      1
##                                              Reference
## Prediction                                    without_proper_documents_or_vehicle_defects
##   distracted_driving                                                                   31
##   leaving_scene_of_an_accident_informing                                                6
##   reckless_driving                                                                     38
##   running_red_light_or_without_headlight                                               35
##   without_proper_documents_or_vehicle_defects                                         414
## 
## Overall Statistics
##                                           
##                Accuracy : 0.831           
##                  95% CI : (0.8044, 0.8554)
##     No Information Rate : 0.6023          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.7379          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: distracted_driving
## Sensitivity                            1.00000
## Specificity                            0.94089
## Pos Pred Value                         0.54717
## Neg Pred Value                         1.00000
## Prevalence                             0.06667
## Detection Rate                         0.06667
## Detection Prevalence                   0.12184
## Balanced Accuracy                      0.97044
##                      Class: leaving_scene_of_an_accident_informing
## Sensitivity                                                1.00000
## Specificity                                                0.98829
## Pos Pred Value                                             0.61538
## Neg Pred Value                                             1.00000
## Prevalence                                                 0.01839
## Detection Rate                                             0.01839
## Detection Prevalence                                       0.02989
## Balanced Accuracy                                          0.99415
##                      Class: reckless_driving
## Sensitivity                           0.8514
## Specificity                           0.9404
## Pos Pred Value                        0.7456
## Neg Pred Value                        0.9686
## Prevalence                            0.1701
## Detection Rate                        0.1448
## Detection Prevalence                  0.1943
## Balanced Accuracy                     0.8959
##                      Class: running_red_light_or_without_headlight
## Sensitivity                                                 0.8790
## Specificity                                                 0.9424
## Pos Pred Value                                              0.7171
## Neg Pred Value                                              0.9791
## Prevalence                                                  0.1425
## Detection Rate                                              0.1253
## Detection Prevalence                                        0.1747
## Balanced Accuracy                                           0.9107
##                      Class: without_proper_documents_or_vehicle_defects
## Sensitivity                                                      0.7901
## Specificity                                                      0.9913
## Pos Pred Value                                                   0.9928
## Neg Pred Value                                                   0.7572
## Prevalence                                                       0.6023
## Detection Rate                                                   0.4759
## Detection Prevalence                                             0.4793
## Balanced Accuracy                                                0.8907
get.macro.f1 <- function(cm) {
    c <- cm$byClass # a single matrix is sufficient
    re <- sum(c[, "Recall"]) / nrow(c)
    pr <- sum(c[, "Precision"]) / nrow(c)
    f1 <- 2 * ((re * pr) / (re + pr))
    ac<-cm$overall["Accuracy"]
    f2<-c(ac,re,pr,f1)
    return(f2)
}
macro.rand <- get.macro.f1(result)

Neural Net

ctrl <- trainControl(method = "repeatedcv", 
                     number = 10, 
                     verboseIter = FALSE,
                     sampling = "up")

set.seed(42)
model_rf_over <- caret::train(model,
                              data = az_me_age_model,
                              method = "nnet",
                              maxit = 1000,
                              trControl = ctrl)
x<-predict(model_rf_over, newdata = az_me_age_model, type = "raw")
result<-confusionMatrix(x,az_me_age_model$viotype)
result
## Confusion Matrix and Statistics
## 
##                                              Reference
## Prediction                                    distracted_driving
##   distracted_driving                                          43
##   leaving_scene_of_an_accident_informing                       1
##   reckless_driving                                             2
##   running_red_light_or_without_headlight                       9
##   without_proper_documents_or_vehicle_defects                  3
##                                              Reference
## Prediction                                    leaving_scene_of_an_accident_informing
##   distracted_driving                                                               0
##   leaving_scene_of_an_accident_informing                                          16
##   reckless_driving                                                                 0
##   running_red_light_or_without_headlight                                           0
##   without_proper_documents_or_vehicle_defects                                      0
##                                              Reference
## Prediction                                    reckless_driving
##   distracted_driving                                        38
##   leaving_scene_of_an_accident_informing                     7
##   reckless_driving                                          51
##   running_red_light_or_without_headlight                    40
##   without_proper_documents_or_vehicle_defects               12
##                                              Reference
## Prediction                                    running_red_light_or_without_headlight
##   distracted_driving                                                              30
##   leaving_scene_of_an_accident_informing                                           6
##   reckless_driving                                                                45
##   running_red_light_or_without_headlight                                          35
##   without_proper_documents_or_vehicle_defects                                      8
##                                              Reference
## Prediction                                    without_proper_documents_or_vehicle_defects
##   distracted_driving                                                                  193
##   leaving_scene_of_an_accident_informing                                               20
##   reckless_driving                                                                    184
##   running_red_light_or_without_headlight                                               80
##   without_proper_documents_or_vehicle_defects                                          47
## 
## Overall Statistics
##                                           
##                Accuracy : 0.2207          
##                  95% CI : (0.1935, 0.2497)
##     No Information Rate : 0.6023          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0779          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
## 
## Statistics by Class:
## 
##                      Class: distracted_driving
## Sensitivity                            0.74138
## Specificity                            0.67857
## Pos Pred Value                         0.14145
## Neg Pred Value                         0.97350
## Prevalence                             0.06667
## Detection Rate                         0.04943
## Detection Prevalence                   0.34943
## Balanced Accuracy                      0.70998
##                      Class: leaving_scene_of_an_accident_informing
## Sensitivity                                                1.00000
## Specificity                                                0.96019
## Pos Pred Value                                             0.32000
## Neg Pred Value                                             1.00000
## Prevalence                                                 0.01839
## Detection Rate                                             0.01839
## Detection Prevalence                                       0.05747
## Balanced Accuracy                                          0.98009
##                      Class: reckless_driving
## Sensitivity                          0.34459
## Specificity                          0.68006
## Pos Pred Value                       0.18085
## Neg Pred Value                       0.83503
## Prevalence                           0.17011
## Detection Rate                       0.05862
## Detection Prevalence                 0.32414
## Balanced Accuracy                    0.51232
##                      Class: running_red_light_or_without_headlight
## Sensitivity                                                0.28226
## Specificity                                                0.82708
## Pos Pred Value                                             0.21341
## Neg Pred Value                                             0.87394
## Prevalence                                                 0.14253
## Detection Rate                                             0.04023
## Detection Prevalence                                       0.18851
## Balanced Accuracy                                          0.55467
##                      Class: without_proper_documents_or_vehicle_defects
## Sensitivity                                                     0.08969
## Specificity                                                     0.93353
## Pos Pred Value                                                  0.67143
## Neg Pred Value                                                  0.40375
## Prevalence                                                      0.60230
## Detection Rate                                                  0.05402
## Detection Prevalence                                            0.08046
## Balanced Accuracy                                               0.51161
get.macro.f1 <- function(cm) {
    c <- cm$byClass # a single matrix is sufficient
    re <- sum(c[, "Recall"]) / nrow(c)
    pr <- sum(c[, "Precision"]) / nrow(c)
    f1 <- 2 * ((re * pr) / (re + pr))
    ac<-cm$overall["Accuracy"]
    f2<-c(ac,re,pr,f1)
    return(f2)
}
macro.nn <- get.macro.f1(result)

Knn

ctrl <- trainControl(method = "repeatedcv", 
                     repeats = 3,
                     sampling = "up")

set.seed(42)
model_rf_over <- caret::train(model,
                              data = az_me_age_model,
                              method = "knn",
                              preProcess = c("scale", "center"),
                              tuneLength = 20,
                              trControl = ctrl)

x<-predict(model_rf_over, newdata = az_me_age_model, type = "raw")
result<-confusionMatrix(x,az_me_age_model$viotype)
result
## Confusion Matrix and Statistics
## 
##                                              Reference
## Prediction                                    distracted_driving
##   distracted_driving                                          57
##   leaving_scene_of_an_accident_informing                       0
##   reckless_driving                                             0
##   running_red_light_or_without_headlight                       1
##   without_proper_documents_or_vehicle_defects                  0
##                                              Reference
## Prediction                                    leaving_scene_of_an_accident_informing
##   distracted_driving                                                               0
##   leaving_scene_of_an_accident_informing                                          16
##   reckless_driving                                                                 0
##   running_red_light_or_without_headlight                                           0
##   without_proper_documents_or_vehicle_defects                                      0
##                                              Reference
## Prediction                                    reckless_driving
##   distracted_driving                                        18
##   leaving_scene_of_an_accident_informing                     6
##   reckless_driving                                          93
##   running_red_light_or_without_headlight                    20
##   without_proper_documents_or_vehicle_defects               11
##                                              Reference
## Prediction                                    running_red_light_or_without_headlight
##   distracted_driving                                                               9
##   leaving_scene_of_an_accident_informing                                           4
##   reckless_driving                                                                14
##   running_red_light_or_without_headlight                                          93
##   without_proper_documents_or_vehicle_defects                                      4
##                                              Reference
## Prediction                                    without_proper_documents_or_vehicle_defects
##   distracted_driving                                                                  103
##   leaving_scene_of_an_accident_informing                                               16
##   reckless_driving                                                                    122
##   running_red_light_or_without_headlight                                              124
##   without_proper_documents_or_vehicle_defects                                         159
## 
## Overall Statistics
##                                           
##                Accuracy : 0.4805          
##                  95% CI : (0.4468, 0.5143)
##     No Information Rate : 0.6023          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.3344          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: distracted_driving
## Sensitivity                            0.98276
## Specificity                            0.83990
## Pos Pred Value                         0.30481
## Neg Pred Value                         0.99854
## Prevalence                             0.06667
## Detection Rate                         0.06552
## Detection Prevalence                   0.21494
## Balanced Accuracy                      0.91133
##                      Class: leaving_scene_of_an_accident_informing
## Sensitivity                                                1.00000
## Specificity                                                0.96956
## Pos Pred Value                                             0.38095
## Neg Pred Value                                             1.00000
## Prevalence                                                 0.01839
## Detection Rate                                             0.01839
## Detection Prevalence                                       0.04828
## Balanced Accuracy                                          0.98478
##                      Class: reckless_driving
## Sensitivity                           0.6284
## Specificity                           0.8116
## Pos Pred Value                        0.4061
## Neg Pred Value                        0.9142
## Prevalence                            0.1701
## Detection Rate                        0.1069
## Detection Prevalence                  0.2632
## Balanced Accuracy                     0.7200
##                      Class: running_red_light_or_without_headlight
## Sensitivity                                                 0.7500
## Specificity                                                 0.8056
## Pos Pred Value                                              0.3908
## Neg Pred Value                                              0.9509
## Prevalence                                                  0.1425
## Detection Rate                                              0.1069
## Detection Prevalence                                        0.2736
## Balanced Accuracy                                           0.7778
##                      Class: without_proper_documents_or_vehicle_defects
## Sensitivity                                                      0.3034
## Specificity                                                      0.9566
## Pos Pred Value                                                   0.9138
## Neg Pred Value                                                   0.4756
## Prevalence                                                       0.6023
## Detection Rate                                                   0.1828
## Detection Prevalence                                             0.2000
## Balanced Accuracy                                                0.6300
get.macro.f1 <- function(cm) {
    c <- cm$byClass # a single matrix is sufficient
    re <- sum(c[, "Recall"]) / nrow(c)
    pr <- sum(c[, "Precision"]) / nrow(c)
    f1 <- 2 * ((re * pr) / (re + pr))
    ac<-cm$overall["Accuracy"]
    f2<-c(ac,re,pr,f1)
    return(f2)
}
macro.knn <- get.macro.f1(result)
rbind(macro.mul,macro.dt,macro.rand,macro.nn,macro.knn)
##             Accuracy                              
## macro.mul  0.3229885 0.3977002 0.2834443 0.3309896
## macro.dt   0.5425287 0.7133507 0.4818621 0.5751891
## macro.rand 0.8310345 0.9040920 0.7236055 0.8038422
## macro.nn   0.2206897 0.4915853 0.3054283 0.3767667
## macro.knn  0.4804598 0.7329144 0.4792856 0.5795666

Map visualizations

Model Output with max probability violation

#Importing map for mesa city
register_google(key = "AIzaSyCvgTagZzhtN_1FiBYEQy29kgkSYgxEAao", write = TRUE)
mesa <- ggmap(get_map(c(left=-111.9363,bottom=33.27729,right=-111.5822,top=33.49722)))

#Predicted probability percentages for getting arrested for different  violation types at different locations.
x<-predict(model_rf_over_rf, newdata = az_me_age_model, type = "prob")


#combining predicted values with original data set.
n<-nrow(x)
x$obs<-1:n
x_long <- x %>% pivot_longer(-obs,
                       names_to = "violation_type",
                       values_to = "prob")
x_max <- x_long %>%
            group_by(obs) %>%
              summarise(max_prob = max(prob),
                        violation_type = violation_type[which(prob == max_prob)])


az_me_age_model$obs<-c(1:n)
az_me_age_model<-inner_join(az_me_age_model,x_max,by="obs")
az_me_age_model_shiny<-inner_join(az_me_age_model,x_long,by="obs")

az_me_age_model <- az_me_age_model %>%
  arrange(obs) %>%
  mutate( violation_type=factor(violation_type, unique(violation_type))) %>%
  mutate( mytext=paste(
    "Violation Type: ", violation_type, "\n", 
    "Percentage of getting arrested for this violation: ", formatC(max_prob*100), sep="")
  )

shiny_data<- az_me_age_model_shiny %>% group_by(violation_type.y,subject_race,subject_sex) %>% distinct()
shiny_data$violation_type<- as.factor(shiny_data$violation_type.y)
shiny_data<-shiny_data[,c("subject_age","subject_race","subject_sex","lng","lat","violation_type","prob")]
shiny_data<-unique(shiny_data)
write.csv(shiny_data,"~/shiny_data.csv", row.names = FALSE)

# Make the map (static)
p <- mesa +
  geom_point(
    aes(
      x = lng,
      y = lat,
      text = mytext,
      color =violation_type
    ),
    data = az_me_age_model
  ) +
  theme_void() +
  ggtitle("Probability of arrest violation types across the MESA(AZ state)")+
  theme_bw() + theme(axis.title.x=element_blank(),
                     axis.title.y=element_blank(),
                     panel.border = element_blank())

p <- ggplotly(p,tooltip = "text")
p

Heat map for number of arrests which are predicted from the model

az_me_age_model$lon<-az_me_age_model$lng
mesa +
  coord_equal() + 
  xlab('Longitude') + 
  ylab('Latitude') + 
  stat_density2d(aes(fill = ..level..), alpha = .3,
                 geom = "polygon", data = az_me_age_model,show.legend=TRUE) + 
  scale_fill_viridis_c() + 
  theme(axis.line=element_blank(),axis.text.x=element_blank(),
        axis.text.y=element_blank(),axis.ticks=element_blank(),
        axis.title.x=element_blank(),
        axis.title.y=element_blank())+ labs(fill = "Number of Arrest Cases")+
  facet_wrap(~violation_type)